Análisis exploratorio

Importación de paquetes y carga de .csv

In [82]:
import pandas as pd
import plotly.graph_objects as go
from plotly import offline
offline.init_notebook_mode()

Y_NAME = 'IsCanceled'
CSV_PATH = 'hotusa_cancellations.csv'


# Eliminación de espacios en el archivo .csv para parsear correctamente
# las variables
with open(CSV_PATH, 'r') as filein:
    lines = filein.readlines()

new_lines = [line.replace(' ', '') for line in lines]

with open(CSV_PATH, 'w') as fileout:
    fileout.writelines(new_lines)
In [83]:
data = pd.read_csv(CSV_PATH, parse_dates=['ReservationStatusDate','ArrivalDate'])
data
Out[83]:
Unnamed: 0 HotelId ReservationStatusDate ArrivalDate LeadTime StaysInWeekendNights StaysInWeekNights Adults Children CustomerType ADR Meal Country Company ReservedRoomType IsRepeatedGuest IsCanceled
0 10043 1 2016-12-15 2017-02-10 238 0 2 2 0 Transient 54.00 HB PRT NaN A False True
1 19043 1 2015-12-07 2016-05-27 174 2 5 2 1 Transient 108.02 BB PRT NaN F False True
2 3509 1 2015-08-24 2015-12-20 260 4 9 1 0 Transient 47.54 BB PRT NaN D False True
3 5012 1 2016-01-05 2016-04-12 168 0 2 2 0 Transient 86.00 HB PRT NaN A False True
4 17991 1 2015-11-11 2015-11-11 1 0 1 1 0 Transient 35.00 BB PRT 88.0 A True True
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
33683 40055 1 2017-09-10 2017-08-31 212 2 8 2 1 Transient 89.75 BB GBR NaN A False False
33684 40056 1 2017-09-10 2017-08-30 169 2 9 2 0 Transient-Party 202.27 BB IRL NaN E False False
33685 40057 1 2017-09-12 2017-08-29 204 4 10 2 0 Transient 153.57 BB IRL NaN E False False
33686 40058 1 2017-09-14 2017-08-31 211 4 10 2 0 Contract 112.80 HB GBR NaN D False False
33687 40059 1 2017-09-14 2017-08-31 161 4 10 2 0 Transient 99.06 HB DEU NaN A False False

33688 rows × 17 columns

Exploración básica del DataFrame

In [84]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33688 entries, 0 to 33687
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   Unnamed: 0             33688 non-null  int64         
 1   HotelId                33688 non-null  int64         
 2   ReservationStatusDate  33688 non-null  datetime64[ns]
 3   ArrivalDate            33688 non-null  datetime64[ns]
 4   LeadTime               33688 non-null  int64         
 5   StaysInWeekendNights   33688 non-null  int64         
 6   StaysInWeekNights      33688 non-null  int64         
 7   Adults                 33688 non-null  int64         
 8   Children               33688 non-null  int64         
 9   CustomerType           33688 non-null  object        
 10  ADR                    33688 non-null  float64       
 11  Meal                   33688 non-null  object        
 12  Country                33256 non-null  object        
 13  Company                2869 non-null   float64       
 14  ReservedRoomType       33688 non-null  object        
 15  IsRepeatedGuest        33688 non-null  bool          
 16  IsCanceled             33688 non-null  bool          
dtypes: bool(2), datetime64[ns](2), float64(2), int64(7), object(4)
memory usage: 3.9+ MB
In [85]:
data.describe()
Out[85]:
Unnamed: 0 HotelId ReservationStatusDate ArrivalDate LeadTime StaysInWeekendNights StaysInWeekNights Adults Children ADR Company
count 33688.000000 33688.0 33688 33688 33688.000000 33688.000000 33688.000000 33688.000000 33688.000000 33688.000000 2869.000000
mean 22133.238898 1.0 2016-08-04 09:28:56.309665280 2016-08-11 17:20:07.124198656 85.972839 1.163649 3.074121 1.850303 0.117965 92.892934 240.726734
min 0.000000 1.0 2014-11-18 00:00:00 2015-07-01 00:00:00 0.000000 0.000000 0.000000 0.000000 0.000000 -6.380000 6.000000
25% 14232.750000 1.0 2016-02-05 00:00:00 2016-02-12 00:00:00 7.000000 0.000000 1.000000 2.000000 0.000000 49.500000 154.000000
50% 23197.500000 1.0 2016-08-08 00:00:00 2016-08-15 00:00:00 47.000000 1.000000 3.000000 2.000000 0.000000 74.000000 223.000000
75% 31634.250000 1.0 2017-02-17 00:00:00 2017-02-26 00:00:00 145.000000 2.000000 5.000000 2.000000 0.000000 121.250000 329.000000
max 40059.000000 1.0 2017-09-14 00:00:00 2017-08-31 00:00:00 737.000000 19.000000 50.000000 27.000000 3.000000 508.000000 543.000000
std 11298.730622 0.0 NaN NaN 95.587656 1.147904 2.465181 0.537657 0.421017 60.278815 125.950403

Valores perdidos y eliminación de variables

In [86]:
display(data.isna().sum() / len(data))

# Eliminación de variable 'Company' por tner el 91% de valores nulos.
# Eliminación del primer campo sin identificación o referencias en la documentación
# Eliminación del HotelId, ya que es una constante

data.drop(['Company', 'Unnamed: 0', 'HotelId'], axis=1, inplace=True)
Unnamed: 0               0.000000
HotelId                  0.000000
ReservationStatusDate    0.000000
ArrivalDate              0.000000
LeadTime                 0.000000
StaysInWeekendNights     0.000000
StaysInWeekNights        0.000000
Adults                   0.000000
Children                 0.000000
CustomerType             0.000000
ADR                      0.000000
Meal                     0.000000
Country                  0.012824
Company                  0.914836
ReservedRoomType         0.000000
IsRepeatedGuest          0.000000
IsCanceled               0.000000
dtype: float64

Exploración por variable

ReservationStatusDate

In [87]:
aux = data.sort_values(by='ReservationStatusDate')
aux = aux[['ReservationStatusDate', Y_NAME]].groupby('ReservationStatusDate')[Y_NAME].sum()

fig = go.Figure([
    go.Bar(
        x=aux.index,
        y=aux
    )
])
fig.update_layout(title='Recuento de cancelaciones por fecha de reserva',
                  yaxis_title='Recuento',
                  template='plotly_white')
fig.show()

ArrivalDate

In [88]:
aux = data.sort_values(by='ArrivalDate')
aux = aux[['ArrivalDate', Y_NAME]].groupby('ArrivalDate')[Y_NAME].sum()

fig = go.Figure([
    go.Bar(
        x=aux.index,
        y=aux
    )
])
fig.update_layout(title='Recuento de cancelaciones por fecha de llegada',
                  yaxis_title='Recuento',
                  template='plotly_white')
fig.show()

LeadTime

In [89]:
fig = go.Figure()
fig.add_trace(go.Box(x=data.loc[data[Y_NAME], 'LeadTime'], name='Cancelado'))
fig.add_trace(go.Box(x=data.loc[~data[Y_NAME], 'LeadTime'], name='No Cancelado'))
fig.update_layout(template='plotly_white', xaxis_title='Días de antelación de reserva')
fig.show()

StaysInWeekendNights

In [90]:
fig = go.Figure()
fig.add_trace(go.Box(x=data.loc[data[Y_NAME], 'StaysInWeekendNights'], name='Cancelado'))
fig.add_trace(go.Box(x=data.loc[~data[Y_NAME], 'StaysInWeekendNights'], name='No Cancelado'))
fig.update_layout(template='plotly_white', xaxis_title='Número de noches en fin de semana')
fig.show()

StaysInWeekNights

In [91]:
fig = go.Figure()
fig.add_trace(go.Box(x=data.loc[data[Y_NAME], 'StaysInWeekNights'], name='Cancelado'))
fig.add_trace(go.Box(x=data.loc[~data[Y_NAME], 'StaysInWeekNights'], name='No Cancelado'))
fig.update_layout(template='plotly_white', xaxis_title='Número de noches en semana')
fig.show()

Adults

In [92]:
cross = pd.crosstab(data[Y_NAME], data['Adults'])
pd.DataFrame({
        'relative': cross.loc[True] / (cross.loc[True] + cross.loc[False]),
        'total': cross.loc[True] + cross.loc[False]
}).sort_values(['relative', 'total'], ascending=False)
Out[92]:
relative total
Adults
26 1.000000 3
5 1.000000 1
6 1.000000 1
20 1.000000 1
27 1.000000 1
2 0.156840 26001
4 0.153846 26
3 0.138889 1224
1 0.076492 6419
0 0.000000 11

Children

In [93]:
cross = pd.crosstab(data[Y_NAME], data['Children'])
pd.DataFrame({
        'relative': cross.loc[True] / (cross.loc[True] + cross.loc[False]),
        'total': cross.loc[True] + cross.loc[False]
}).sort_values(['relative', 'total'], ascending=False)
Out[93]:
relative total
Children
2 0.263468 1188
1 0.175789 1553
0 0.134618 30932
3 0.000000 15

CustomerType

In [94]:
cross = pd.crosstab(data[Y_NAME], data['CustomerType'])
pd.DataFrame({
        'relative': cross.loc[True] / (cross.loc[True] + cross.loc[False]),
        'total': cross.loc[True] + cross.loc[False]
}).sort_values(['relative', 'total'], ascending=False)
Out[94]:
relative total
CustomerType
Transient 0.162181 24818
Transient-Party 0.093380 6918
Group 0.052239 268
Contract 0.038599 1684

ADR

In [95]:
fig = go.Figure()
fig.add_trace(go.Box(x=data.loc[data[Y_NAME], 'ADR'], name='Cancelado'))
fig.add_trace(go.Box(x=data.loc[~data[Y_NAME], 'ADR'], name='No Cancelado'))
fig.update_layout(template='plotly_white', xaxis_title='Precio medio por noche')
fig.show()

Meal

In [96]:
cross = pd.crosstab(data[Y_NAME], data['Meal'])
pd.DataFrame({
        'relative': cross.loc[True] / (cross.loc[True] + cross.loc[False]),
        'total': cross.loc[True] + cross.loc[False]
}).sort_values(['relative', 'total'], ascending=False)
Out[96]:
relative total
Meal
FB 0.367886 492
HB 0.166944 6601
BB 0.131072 25505
Undefined 0.122266 1006
SC 0.011905 84

Country

In [97]:
# Ordenado por frecuencia de cancelación
cross = pd.crosstab(data[Y_NAME], data['Country'])
pd.DataFrame({
        'relative': cross.loc[True] / (cross.loc[True] + cross.loc[False]),
        'total': cross.loc[True] + cross.loc[False]
}).sort_values(['relative', 'total'], ascending=False)
Out[97]:
relative total
Country
GEO 0.500000 8
ARE 0.500000 6
MDV 0.500000 4
PAK 0.500000 4
KOR 0.333333 9
... ... ...
SYR 0.000000 1
TGO 0.000000 1
UGA 0.000000 1
UZB 0.000000 1
ZMB 0.000000 1

118 rows × 2 columns

In [98]:
# Ordenado por frecuencia de país
cross = pd.crosstab(data[Y_NAME], data['Country'])
pd.DataFrame({
        'relative': cross.loc[True] / (cross.loc[True] + cross.loc[False]),
        'total': cross.loc[True] + cross.loc[False]
}).sort_values(['total', 'relative'], ascending=False)
Out[98]:
relative total
Country
PRT 0.238551 13385
GBR 0.058347 6290
ESP 0.105157 3471
IRL 0.098284 1923
FRA 0.060403 1490
... ... ...
SYR 0.000000 1
TGO 0.000000 1
UGA 0.000000 1
UZB 0.000000 1
ZMB 0.000000 1

118 rows × 2 columns

ReservedRoomType

In [99]:
cross = pd.crosstab(data[Y_NAME], data['ReservedRoomType'])
pd.DataFrame({
        'relative': cross.loc[True] / (cross.loc[True] + cross.loc[False]),
        'total': cross.loc[True] + cross.loc[False]
}).sort_values(['relative', 'total'], ascending=False)
Out[99]:
relative total
ReservedRoomType
H 0.259875 481
G 0.227818 1251
L 0.200000 5
C 0.154058 727
E 0.145828 4183
A 0.138816 19760
D 0.127429 6278
F 0.074000 1000
B 0.000000 3

IsRepeatedGuest

In [100]:
cross = pd.crosstab(data[Y_NAME], data['IsRepeatedGuest'])
pd.DataFrame({
        'relative': cross.loc[True] / (cross.loc[True] + cross.loc[False]),
        'total': cross.loc[True] + cross.loc[False]
}).sort_values(['relative', 'total'], ascending=False)
Out[100]:
relative total
IsRepeatedGuest
False 0.146848 31965
True 0.032501 1723

IsCancelled

In [101]:
fig = go.Figure([
    go.Bar(
        x=data[Y_NAME].value_counts().index,
        y=data[Y_NAME].value_counts()
    )
])
fig.update_layout(title=f'Barras de frecuencia para { Y_NAME }',
                 yaxis_title='Recuento')
fig.show()